The data is cleaned by dropping records with an invalid month code (13) and then reshaped into one rainfall column per state so that it can be passed to the plotly functions.
library(plotly)
library(zoo)
# Rainfall in India: monthly rainfall per state drawn as one line trace per
# state, with a dropdown menu that switches which state is visible.
data <- read.csv(
  "~/Documents/Assignments/DA/week3_Assign/RainfallData.csv",
  header = TRUE
)

# Month code "13" is not a calendar month; drop those records.
data <- subset(data, substr(data$YYYYMM, 5, 6) != "13")

states <- setNames(unique(subset(data, select = c("Description"))), c("State"))
rownames(states) <- NULL

# Reshape long -> wide: start from the distinct months and join in one value
# column per state. merge() performs an inner join, so only months that are
# present for every state survive.
dataset <- unique(subset(data, select = c("YYYYMM")))
for (i in seq_len(nrow(states))) {
  state_name <- as.character(states[i, ])
  indata <- setNames(
    subset(data, select = c("YYYYMM", "Value"), Description == state_name),
    c("YYYYMM", state_name)
  )
  rownames(indata) <- NULL
  dataset <- merge(dataset, indata, by = "YYYYMM")
}

# Missing readings are stored as the text "Not Available". They are plotted
# as 0, and every state column is converted to numeric so that plotly draws
# a continuous y axis instead of treating the values as categories.
to_rainfall <- function(x) {
  as.numeric(gsub("Not Available", "0", as.character(x), fixed = TRUE))
}
dataset[-1] <- lapply(dataset[-1], to_rainfall)

# Give the multi-word state columns syntactic names. The positions rely on
# the order in which the states first appear in the CSV.
colnames(dataset)[3] <- "Tamil_Nadu"
colnames(dataset)[7] <- "Madhya_Pradesh"
colnames(dataset)[9] <- "West_Bengal"
colnames(dataset)[10] <- "Arunachal_Pradesh"

# Columns to plot, in trace order; the display label is the column name with
# underscores replaced by spaces.
state_cols <- c(
  "Karnataka", "Tamil_Nadu", "Kerala", "Orissa", "Maharashtra",
  "Madhya_Pradesh", "Bihar", "West_Bengal", "Arunachal_Pradesh"
)
state_labels <- gsub("_", " ", state_cols, fixed = TRUE)
stopifnot(all(state_cols %in% colnames(dataset)))

# x axis: last day of each YYYYMM month.
month_end <- as.Date(
  as.yearmon(as.character(dataset$YYYYMM), "%Y%m"),
  frac = 1
)

# One trace per state; only the first one is visible initially.
rain_plot <- plot_ly(x = month_end)
for (k in seq_along(state_cols)) {
  rain_plot <- add_lines(
    rain_plot,
    y = dataset[[state_cols[k]]],
    name = state_labels[k],
    visible = (k == 1)
  )
}

# One dropdown button per state: show trace k and hide all others.
state_buttons <- lapply(seq_along(state_labels), function(k) {
  list(
    method = "restyle",
    args = list("visible", as.list(seq_along(state_labels) == k)),
    label = state_labels[k]
  )
})

rain_plot %>%
  layout(
    title = "Rainfall in India",
    xaxis = list(rangeslider = list(type = "date"), title = "year"),
    yaxis = list(title = "Rainfall"),
    updatemenus = list(
      list(
        y = 0.3,
        buttons = state_buttons
      )
    )
  )
A grouped bar graph and a line graph were plotted to understand the distribution of the top 5 crops with respect to the area of land on which they were harvested. The crop “Total foodgrain” is dropped because many N/A records are associated with it. The grouped bar graph is plotted so that each year can be isolated and the values for the different crops in that year compared. Double-click on a legend entry to isolate a year.
# Load the agriculture records and derive, per crop and year, the summed
# Area of the five crops with the largest overall Area.
data <- read.csv(
  "~/Documents/Assignments/DA/week3_Assign/TNAgri.csv",
  header = TRUE,
  sep = ","
)

# Exclude the "Total foodgrain" rows (rows with a missing Crop are dropped
# too, as with subset()).
data <- data[which(data$Crop != "Total foodgrain"), ]

# Only these three columns are needed for the analysis.
crops_data <- data[, c("Crop", "Crop_Year", "Area")]

# Area summed per crop and year, then per crop across all years.
Years_crops <- aggregate(Area ~ Crop + Crop_Year, data = crops_data, FUN = sum)
top5_data <- aggregate(Area ~ Crop, data = Years_crops, FUN = sum)

# Rank crops by total Area (largest first) and keep the first five rows.
sorted_data <- top5_data[order(-top5_data$Area), ]
sorted_data <- sorted_data[1:5, ]

# Yearly figures restricted to those five crops.
interested_data <- data.frame(
  Years_crops[Years_crops$Crop %in% sorted_data$Crop, ]
)
#bar plot
# Grouped bar chart of Area per crop with one trace per crop year
# (1997-2013). The traces are generated in a loop from the colour vectors
# below instead of being written out once per year.
bar_years <- 1997:2013

# Fill colour of each year's bars, in the same order as bar_years.
bar_fill <- c(
  "rgb(164, 219, 194)", "rgb(49, 249, 18)", "rgb(249, 18, 249)",
  "rgb(30, 252, 248)", "rgb(183, 162, 0)", "rgb(252, 63, 30)",
  "rgb(10, 153, 65)", "rgb(198, 14, 239)", "rgb(255, 122, 45)",
  "rgb(223, 255, 45)", "rgb(45, 146, 255)", "rgb(231, 196, 255)",
  "rgb(155, 48, 107)", "rgb(130, 190, 255)", "rgb(160, 141, 99)",
  "rgb(88, 66, 229)", "#AED581"
)

# Outline colour of the bars: one shared colour, except for 2007 and 2010.
bar_outline <- rep("rgb(8,48,107)", length(bar_years))
bar_outline[bar_years == 2007] <- "rgb(19,38,17)"
bar_outline[bar_years == 2010] <- "rgb(88,248,107)"

stopifnot(
  length(bar_fill) == length(bar_years),
  length(bar_outline) == length(bar_years)
)

barplt <- plot_ly()
for (k in seq_along(bar_years)) {
  # Rows of the top-5 crop table that belong to this year.
  year_rows <- interested_data[
    which(interested_data$Crop_Year == bar_years[k]),
  ]
  barplt <- add_trace(
    barplt,
    x = factor(year_rows$Crop),
    y = year_rows$Area,
    name = bar_years[k],
    type = "bar",
    text = year_rows$Crop_Year,  # the year is printed on each bar
    textposition = "auto",
    marker = list(
      color = bar_fill[[k]],
      line = list(color = bar_outline[[k]], width = 1.5)
    )
  )
}

barplt <- barplt %>%
  layout(
    title = "Analysis of trends for top 5 Crops over the years",
    barmode = "group",
    yaxis = list(title = "Area")
  )
barplt
#line plot
# Line chart of Area per year, one line per crop. The crops and their line
# colours are listed once and the traces are generated from that list.
crop_line_colors <- c(
  Groundnut = "rgb(249, 151, 22)",
  Jowar     = "rgb(128, 2, 255)",
  Rice      = "rgb(15, 255, 15)",
  Sugarcane = "rgb(255, 15, 123)",
  Urad      = "rgb(255, 255, 2)"
)

# type/mode are set on each trace rather than in plot_ly(); a plot_ly() call
# that carries a type but no data would otherwise register an extra, empty
# trace in addition to the five crop lines.
lineplot <- plot_ly()
for (crop_name in names(crop_line_colors)) {
  crop_rows <- interested_data[which(interested_data$Crop == crop_name), ]
  lineplot <- add_trace(
    lineplot,
    x = crop_rows$Crop_Year,
    y = crop_rows$Area,
    type = "scatter",
    mode = "lines",
    name = crop_name,
    line = list(color = crop_line_colors[[crop_name]], width = 4)
  )
}

lineplot <- lineplot %>%
  layout(
    title = "Analysis of trends for top 5 Crops over the years",
    xaxis = list(title = "Years"),
    yaxis = list(title = "Area")
  )
lineplot
library(data.table)
# Choropleth of gun-violence incidents per US state, with the number of
# people killed and injured shown in the hover text.
graph <- read.csv(
  "~/Documents/Assignments/DA/week3_Assign/GunViolence.csv",
  header = TRUE
)

# Lookup table from full state name to two-letter code (built-in datasets).
states_code <- data.frame(state.name, state.abb, stringsAsFactors = FALSE)

# Per state: number of distinct incidents and the summed killed / injured
# counts, computed in a single grouped aggregation.
incidents <- data.table(
  subset(graph, select = c(incident_id, state, n_killed, n_injured))
)
by_state <- incidents[
  ,
  .(
    number_of_incidents = uniqueN(incident_id),
    n_killed = sum(n_killed),
    n_injured = sum(n_injured)
  ),
  by = state
]

# Attach the state codes; states missing from the lookup table are dropped
# by the inner join.
df <- merge(
  as.data.frame(by_state),
  states_code,
  by.x = "state",
  by.y = "state.name"
)

df$hover <- with(df, paste(state, "<br>",
                           "Number of kills", n_killed,
                           "<br>", "Number of injuries", n_injured))

# Map options: restrict to the USA and draw lakes in white.
geo_options <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

plot_geo(df, locationmode = 'USA-states') %>%
  add_trace(
    z = ~number_of_incidents, text = ~hover, locations = ~state.abb,
    color = ~number_of_incidents, colors = 'Reds'
  ) %>%
  colorbar(title = "Count") %>%
  layout(
    title = 'Crimes in USA by State<br>(Hover for breakdown)',
    geo = geo_options
  )
Two word clouds and one pie chart per gender are drawn. The first word cloud represents the resolution topics, while the second one is created from the tweet text. The text is preprocessed using stemming and the stop words are removed. The pie charts show the share of each resolution category in the data for that gender.
# Load the New Year resolution tweets and count, separately for each gender,
# how many rows fall into every resolution category.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; it is kept because the file name below is resolved against it.
setwd("~/Documents/Assignments/DA/week3_Assign/")
data <- read.csv("NewYearResolution15.csv", header = TRUE, sep = ",")

# Columns carried into the per-gender tables.
profile_cols <- c("name", "gender", "Resolution_Category")
male_data <- data[which(data$gender == "male"), profile_cols]
female_data <- data[which(data$gender == "female"), profile_cols]

# Rows per category, returned as a two-column table (Category, Count).
female_count <- setNames(
  aggregate(gender ~ Resolution_Category, data = female_data, FUN = length),
  c("Category", "Count")
)
male_count <- setNames(
  aggregate(gender ~ Resolution_Category, data = male_data, FUN = length),
  c("Category", "Count")
)
#pie charts
# Slice colours shared by both charts.
colors <- c('rgb(229, 90, 66)','rgb(125, 229, 66)','rgb(66, 229, 220)','rgb(229, 66, 223)','rgb(93, 66, 229)','rgb(255, 251, 20)','rgb(20, 255, 184)','rgb(184, 119, 214)','rgb(202, 214, 119)','rgb(168, 127, 97)')

# Build a donut chart of resolution-category shares.
#   counts      data frame with columns Category and Count
#   chart_title title shown above the chart
#   palette     slice colours
# Returns the plotly object.
resolution_pie <- function(counts, chart_title, palette = colors) {
  # Axes carry no information for a pie chart, so hide them.
  hidden_axis <- list(
    showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE
  )
  plot_ly(counts, labels = ~Category, values = ~Count,
          textposition = 'inside',
          textinfo = 'percent',
          # six-digit hex black; the five-digit '#00000' is not a valid colour
          insidetextfont = list(color = '#000000'),
          marker = list(colors = palette,
                        line = list(color = '#FFFFFF', width = 1))) %>%
    add_pie(hole = 0.3) %>%
    layout(title = chart_title, showlegend = TRUE,
           xaxis = hidden_axis,
           yaxis = hidden_axis)
}

female_plot <- resolution_pie(
  female_count, "Resolution Category Proportion (Female)"
)
female_plot

male_plot <- resolution_pie(
  male_count, "Resolution Category Proportion (Male)"
)
male_plot
#install.packages("tm") # for text mining
#install.packages("SnowballC") # for text stemming
#install.packages("wordcloud") # word-cloud generator
#install.packages("RColorBrewer") # color palettes
library(tm)
library(SnowballC)
library(RColorBrewer)
library(wordcloud)
# Turn a character vector of documents into a word-frequency table.
# The text is lower-cased, numbers, English stop words and punctuation are
# removed and whitespace is collapsed before the terms are counted.
#   texts  character vector, one document per element
# Returns a data frame with columns word and freq, most frequent word first.
word_frequencies <- function(texts) {
  corpus <- Corpus(VectorSource(texts))
  # content_transformer() keeps the result a valid corpus for any corpus type
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  term_counts <- rowSums(as.matrix(TermDocumentMatrix(corpus)))
  term_counts <- sort(term_counts, decreasing = TRUE)
  data.frame(word = names(term_counts), freq = term_counts)
}

#resolution_topics wordcloud
rtd <- word_frequencies(data$resolution_topics)
wordcloud(words = rtd$word, freq = rtd$freq, max.words = 200,
          random.color = TRUE, random.order = FALSE, rot.per = 0.55,
          colors = brewer.pal(8, "Dark2"))

#tweet text wordcloud
# The campaign hashtag occurs in the tweets and would dominate the cloud, so
# it is stripped before counting.
tweet_text <- gsub("#NewYearsResolution", "", data$text, fixed = TRUE)
td <- word_frequencies(tweet_text)
wordcloud(words = td$word, freq = td$freq, max.words = 600,
          random.color = TRUE, random.order = FALSE, rot.per = 0.15,
          colors = brewer.pal(8, "Dark2"))